Last modification: 19 May, 2020
@EricJCGalvez
# check.packages function: install and load multiple R packages.
# Check to see if packages are installed. Install them if they are not, then load them into the R session.
check.packages <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE, warn.conflicts = FALSE, quietly=TRUE)
}
# Usage example
packages<-c("pcaExplorer", "tidyverse", "DESeq2", "dplyr",
"ggthemes", "RColorBrewer", "knitr", "ggpubr",
"gplots", "genefilter", "ComplexHeatmap", "circlize",
"stringr", "here")
check.packages(packages)## Loading required package: rtracklayer
## RNAseq counts and genome annotation path
############################################################
dir_counts <- here("data/")
strain <- c("PROD")
straingff <- "../genome_refs/PROD/PROD_nofasta.gff"
### select pair wise contrast
############################################################
control_con <- c("PROD-invitro")
exp_con1 <- c("PROD-eSPF")
exp_con2 <- c("PROD-Conv")sampleFiles <- grep(strain, list.files(dir_counts), value=TRUE)
sampleCondition <- stringr::str_match(sampleFiles, "_(.*?).counts") %>%
gsub("(r.)_", "",. )
sampleCondition <- sampleCondition[,2]
sampleTable <- data.frame(sampleName = sampleFiles,
fileName = sampleFiles,
condition = sampleCondition)
sampleTable$sampleName <- gsub(".counts", "", sampleTable$sampleName)
sampleTableddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable,
directory = dir_counts,
design= ~ 1)
ddsHTSeq
dds <- estimateSizeFactors(ddsHTSeq)
dds$condition<- relevel(dds$condition, ref = control_con)
ddsFull <- DESeq(dds)
res <- results( ddsFull )
boxplot(log10(assays(ddsFull)[["cooks"]]), range=0, las=2)rldFull <- DESeq2::rlog( ddsFull )
#head( assay(rld) )
sampleDists <- dist( t( assay(rldFull) ) )
#sampleDists
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( rldFull$Sample_labs,
rldFull$Treatment, sep="-" )
colnames(sampleDistMatrix) <- NULL
colours = colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
heatmap.2( sampleDistMatrix, trace="none", col=colours, labRow= rldFull$condition,
labCol= rldFull$condition, cexRow = 0.8, cexCol = 0.8, srtCol=45, margins =c(6.5,8) )pcaData <- plotPCA(rldFull, intgroup=c("condition"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p <- ggplot(pcaData, aes(PC1, PC2, color=condition, shape=condition)) +
geom_point(size=6, alpha=0.7) +
scale_color_gdocs() +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
scale_shape_manual(values = c(18 : 21)) +
geom_vline(xintercept = 0, linetype="dashed", color = "black", size=0.2) +
geom_hline(yintercept = 0, linetype="dashed", color = "black", size=0.2)
p + theme_bw(20)Loop
# Subsampling a specific treatment
exps <- c(exp_con1, exp_con2)
pcals <- list()
pcaloads <- list()
top30_loads <- list()
mals <- list()
deseq_mat <- list()
#i=exp_con1
#source('~/.active-rstudio-document')for(i in unique(exps)) {
## subset function, filtering and deseq execution
dds_ko10 <- select_dds_pair(x = ddsFull, control = control_con, expcond = i,
minreads = 10, minsamples = 4)
## extract results and rld transformation for PCA
contrast_name <- DESeq2::resultsNames(dds_ko10)[2]
res_exp <- DESeq2::results(dds_ko10, contrast=c("condition", i, control_con)) ## ORDEN matters !!
rld <- DESeq2::rlog(dds_ko10)
## PCAs
##################################################
pcals[[i]] <- pcaExplorer::pcaplot(rld, intgroup = c("condition"), ntop = 1000,
pcX = 1, pcY = 2, title = contrast_name, text_labels = F,
ellipse = TRUE, ) + scale_fill_manual(values = c( "#B31B21", "steelblue")) +
scale_color_manual(values = c( "#B31B21", "steelblue"))
# pcaplot3d(rld, intgroup = c("condition"),ntop = 1000,
# pcX = 1, pcY = 2, pcZ = 3)
## hi_loadings top 10 signatures
#################################################
pcaobj <- prcomp(t(assay(rld)))
pcaloads[[i]]<- hi_loadings_plot(pcaobj, topN = 10) #annotation = annotation
## getting gff features
#################################################
gene_product_df <- get_genproducts(straingff)
## Top 30 up and 30down geneSig from PCA
## making rownames as a column in a dataframe
df <- as.data.frame(get_top_pca_genes(pcaobj, topN = 30, whichpc = 1))
df <- data.table::setDT(df, keep.rownames = TRUE)[]
colnames(df) <- c("gene_id", "pca_loadings")
gene_df <- as.data.frame(assay(dds_ko10))
gene_df <- data.table::setDT(gene_df, keep.rownames = TRUE)[]
colnames(gene_df)[1] <- c("gene_id")
all_genes_norm <- merge(gene_df, gene_product_df, by="gene_id") %>%
select(gene_id, product, 2:ncol(gene_df))
top30_loads[[i]] <- all_genes_norm %>%
filter(., gene_id %in% df$gene_id)
##### Plot MA function
###################################
## DEseq
###################################
## mergind results with annotation
diff_express=res_exp
diff_express$GENE <- gene_product_df$GeneID[match(row.names(diff_express),
row.names(gene_product_df))]
diff_express$Name <- gene_product_df$Name[match(row.names(diff_express),
row.names(gene_product_df))]
plot_lab <- paste0(i, " -> ", control_con)
color_palette <- c("#B31B21", "#1465AC", "#f7f7f7")
mals[[i]] <- MA_plot(x = diff_express, color=color_palette, main_lab = plot_lab, gename = "Name")
#import 8 x 9
## DEseq MAtrix
diff_express_df <- as.data.frame(diff_express) %>%
select(GENE, Name, 1:ncol(diff_express))
deseq_mat[[i]] <- diff_express_df
}############################################################3
#eport 4X4
DT::datatable(top30_loads[[exp_con1]], filter = 'top',
extensions = 'Buttons', options = list(
#dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel'),
pageLength = 10, autoWidth = T)
)DT::datatable(top30_loads[[exp_con2]], filter = 'top',
extensions = 'Buttons', options = list(
#dom = 'Bfrtip',
buttons = c('copy', 'csv', 'excel'),
pageLength = 10, autoWidth = T)
)deseqres_ls <- list()
for(i in unique(exps)) {
## subset function, filtering and deseq execution
dds_ko10 <- select_dds_pair(x = ddsFull, control = control_con, expcond = i,
minreads = , minsamples =0)
## extract results and rld transformation for PCA
contrast_name <- DESeq2::resultsNames(dds_ko10)[2]
res_exp <- DESeq2::results(dds_ko10, contrast=c("condition", i, control_con)) ## ORDEN matters !!
deseqres_ls[[i]] <- as.data.frame(res_exp)
}## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not an warning or error]
## fitting model and testing
dt <- bind_cols(deseqres_ls)
dt_f <- dt %>%
mutate(., MeanExp=abs(.$baseMean-.$baseMean1/2),
gene_id= rownames(deseqres_ls[[1]])) %>%
filter(., MeanExp >= 10) %>%
as.data.frame()
dt_f <- merge(dt_f, gene_product_df, by="gene_id") %>%
select(gene_id, Name, product, MeanExp, 1:ncol(gene_df))
dt_f <- dt_f %>%
mutate(Highlights=ifelse(abs(dt_f$log2FoldChange - dt_f$log2FoldChange1) >= 2,
TRUE,
ifelse(rowMeans(dt_f[c("log2FoldChange", "log2FoldChange1")]) >= 2,
TRUE,
FALSE)))p <- ggplot(dt_f, aes(x = log2FoldChange1, y = log2FoldChange, label=Name,
size = MeanExp) ) +
geom_point(color = ifelse(abs(dt_f$log2FoldChange - dt_f$log2FoldChange1) >= 2,
"red",
ifelse(rowMeans(dt_f[c("log2FoldChange", "log2FoldChange1")]) >= 2,
"blue",
"black")), alpha= 0.3) +
geom_vline(xintercept = 0, linetype="dashed", color = "blue", size=0.2) +
geom_hline(yintercept = 0, linetype="dashed", color = "blue", size=0.2) +
#xlim(-10, NA) +
#ylim(-10, NA) +
xlab("FC(eSPF vs invitro)") +
ylab("FC(Conv vs invitro)") + theme_bw(24)
p + ggrepel::geom_text_repel(data = dt_f[dt_f$Highlights, ],
aes(label = Name), size = 3.5, box.padding = unit(0.1,
"lines"), point.padding = unit(0.1, "lines"),
segment.size = 0.5)## Warning: Removed 237 rows containing missing values (geom_text_repel).
p <- ggplot(dt_f, aes(x = log2FoldChange1, y = log2FoldChange, label=gene_id,
size = MeanExp) ) +
geom_point(color = ifelse(abs(dt_f$log2FoldChange - dt_f$log2FoldChange1) >= 2,
"red",
ifelse(rowMeans(dt_f[c("log2FoldChange", "log2FoldChange1")]) >= 2,
"blue",
"black")), alpha= 0.3) +
geom_vline(xintercept = 0, linetype="dashed", color = "blue", size=0.2) +
geom_hline(yintercept = 0, linetype="dashed", color = "blue", size=0.2) +
#xlim(-10, NA) +
#ylim(-10, NA) +
xlab("FC(eSPF vs invitro)") +
ylab("SLR(Conv vs invitro)") + theme_bw(24)
plotly::ggplotly(p)#### recapitulate all results datafrme
res_tax <- merge(as.data.frame(res), as.data.frame(counts(ddsFull, normalized=T)), by='row.names', sort=F)
names(res_tax)[1] <- 'gene_id'
res_tax$Name=gene_product_df$Name[match(res_tax$gene_id, gene_product_df$gene_id)]dt1 <- deseq_mat[[exp_con1]]
dt2 <- deseq_mat[[exp_con2]]
dt3 <- res_tax
genes_heatmap <- get_heatmap_IDs(dt1, dt2, dt3, topNb = 25)## Warning: arrange_() is deprecated.
## Please use arrange() instead
##
## The 'programming' vignette or the tidyeval book can help you
## to program with arrange() : https://tidyeval.tidyverse.org
## This warning is displayed once per session.
## this is what the function does
pheatmap:::scale_rows
function (x)
{
m = apply(x, 1, mean, na.rm = T)
s = apply(x, 1, sd, na.rm = T)
return((x - m)/s)
}
mat <- pheatmap:::scale_rows(assay(rldFull)[genes_heatmap$gene_id, ])
type <- sampleCondition
ha = HeatmapAnnotation(df = data.frame(type = sampleCondition), name = "Experiment")
col_fun = circlize::colorRamp2(c(-2, 0, 2), c("#92c5de", "#ffffe5", "#b2182b"))
ComplexHeatmap::Heatmap(mat, name = "expression", top_annotation = ha, column_split = type, col = col_fun,
cluster_rows = TRUE, show_row_names = FALSE,
show_column_names = FALSE, border = TRUE,
row_title_rot = 0, row_title_gp = gpar(fontsize = 8)) +
rowAnnotation(link = anno_mark(at = which(!is.na(genes_heatmap$Name)),
labels = genes_heatmap$Name[ which(!is.na(genes_heatmap$Name))],
labels_gp = gpar(fontsize = 8), padding = 0.5))